{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 波士顿房价预测案例——线性回归分析\n",
    "\n",
    "比较对y和logy进行线性拟合的效果。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1、导入必要的工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np  # 矩阵操作\n",
    "import pandas as pd # SQL数据处理\n",
    "\n",
    "from sklearn.metrics import r2_score  #评价回归预测模型的性能"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 读取数据\n",
    "已经是做完特征工程后的数据，请先运行2_FE_BostonHousePrice.ipynb，得到文件FE_boston_housing.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>TAX</th>\n",
       "      <th>PTRATIO</th>\n",
       "      <th>...</th>\n",
       "      <th>RAD_2</th>\n",
       "      <th>RAD_3</th>\n",
       "      <th>RAD_4</th>\n",
       "      <th>RAD_5</th>\n",
       "      <th>RAD_6</th>\n",
       "      <th>RAD_7</th>\n",
       "      <th>RAD_8</th>\n",
       "      <th>RAD_24</th>\n",
       "      <th>MEDV</th>\n",
       "      <th>log_MEDV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.419782</td>\n",
       "      <td>0.285654</td>\n",
       "      <td>-1.287909</td>\n",
       "      <td>-0.272599</td>\n",
       "      <td>-0.144217</td>\n",
       "      <td>0.413672</td>\n",
       "      <td>-0.120013</td>\n",
       "      <td>0.140214</td>\n",
       "      <td>-0.666608</td>\n",
       "      <td>-1.353192</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.159686</td>\n",
       "      <td>0.345176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.417339</td>\n",
       "      <td>-0.487292</td>\n",
       "      <td>-0.593381</td>\n",
       "      <td>-0.272599</td>\n",
       "      <td>-0.740262</td>\n",
       "      <td>0.194274</td>\n",
       "      <td>0.367166</td>\n",
       "      <td>0.557160</td>\n",
       "      <td>-0.987329</td>\n",
       "      <td>-0.475352</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-0.101524</td>\n",
       "      <td>0.084104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.417342</td>\n",
       "      <td>-0.487292</td>\n",
       "      <td>-0.593381</td>\n",
       "      <td>-0.272599</td>\n",
       "      <td>-0.740262</td>\n",
       "      <td>1.282714</td>\n",
       "      <td>-0.265812</td>\n",
       "      <td>0.557160</td>\n",
       "      <td>-0.987329</td>\n",
       "      <td>-0.475352</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.324247</td>\n",
       "      <td>1.266776</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.416750</td>\n",
       "      <td>-0.487292</td>\n",
       "      <td>-1.306878</td>\n",
       "      <td>-0.272599</td>\n",
       "      <td>-0.835284</td>\n",
       "      <td>1.016303</td>\n",
       "      <td>-0.809889</td>\n",
       "      <td>1.077737</td>\n",
       "      <td>-1.106115</td>\n",
       "      <td>-0.036432</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.182758</td>\n",
       "      <td>1.170822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.412482</td>\n",
       "      <td>-0.487292</td>\n",
       "      <td>-1.306878</td>\n",
       "      <td>-0.272599</td>\n",
       "      <td>-0.835284</td>\n",
       "      <td>1.228577</td>\n",
       "      <td>-0.511180</td>\n",
       "      <td>1.077737</td>\n",
       "      <td>-1.106115</td>\n",
       "      <td>-0.036432</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1.487503</td>\n",
       "      <td>1.373242</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \\\n",
       "0 -0.419782  0.285654 -1.287909 -0.272599 -0.144217  0.413672 -0.120013   \n",
       "1 -0.417339 -0.487292 -0.593381 -0.272599 -0.740262  0.194274  0.367166   \n",
       "2 -0.417342 -0.487292 -0.593381 -0.272599 -0.740262  1.282714 -0.265812   \n",
       "3 -0.416750 -0.487292 -1.306878 -0.272599 -0.835284  1.016303 -0.809889   \n",
       "4 -0.412482 -0.487292 -1.306878 -0.272599 -0.835284  1.228577 -0.511180   \n",
       "\n",
       "        DIS       TAX   PTRATIO    ...     RAD_2  RAD_3  RAD_4  RAD_5  RAD_6  \\\n",
       "0  0.140214 -0.666608 -1.353192    ...         0      0      0      0      0   \n",
       "1  0.557160 -0.987329 -0.475352    ...         1      0      0      0      0   \n",
       "2  0.557160 -0.987329 -0.475352    ...         1      0      0      0      0   \n",
       "3  1.077737 -1.106115 -0.036432    ...         0      1      0      0      0   \n",
       "4  1.077737 -1.106115 -0.036432    ...         0      1      0      0      0   \n",
       "\n",
       "   RAD_7  RAD_8  RAD_24      MEDV  log_MEDV  \n",
       "0      0      0       0  0.159686  0.345176  \n",
       "1      0      0       0 -0.101524  0.084104  \n",
       "2      0      0       0  1.324247  1.266776  \n",
       "3      0      0       0  1.182758  1.170822  \n",
       "4      0      0       0  1.487503  1.373242  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# path to where the data lies\n",
    "#dpath = './data/'\n",
    "df = pd.read_csv(\"FE_boston_housing.csv\")\n",
    "\n",
    "#通过观察前5行，了解数据每列（特征）的概况\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  数据基本信息\n",
    "样本数目、特征维数\n",
    "每个特征的类型、空值样本的数目、数据类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 从原始数据中分离输入特征x和输出y\n",
    "# 这里我们y有2个取值，原始的MEDV及其log1p之后的值\n",
    "col_y = [\"MEDV\",\"log_MEDV\"]\n",
    "y = pd.DataFrame(df,columns = col_y)\n",
    "\n",
    "X = df.drop([\"MEDV\", \"log_MEDV\"], axis = 1)\n",
    "\n",
    "#特征名称，用于后续显示权重系数对应的特征\n",
    "feat_names = X.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "当数据量比较大时，可用train_test_split从训练集中分出一部分做校验集；\n",
    "样本数目较少时，建议用交叉验证。\n",
    "在线性回归中，留一交叉验证有简便计算方式。\n",
    "\n",
    "下面将训练数据分割成训练集和测试集，只是让大家对模型的训练误差、校验集上的测试误差估计、和测试集上的测试误差做个比较。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(404, 21)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#将数据分割训练数据与测试数据\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# 随机采样20%的数据构建测试样本，其余作为训练样本\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33, test_size=0.2)\n",
    "X_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3、确定模型类型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.1 尝试缺省参数的线性回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>coef_log</th>\n",
       "      <th>coef_org</th>\n",
       "      <th>columns</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>0.524147</td>\n",
       "      <td>0.505543</td>\n",
       "      <td>RAD_24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.176170</td>\n",
       "      <td>0.297984</td>\n",
       "      <td>RM</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>0.157229</td>\n",
       "      <td>0.184468</td>\n",
       "      <td>RAD_7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.094180</td>\n",
       "      <td>0.147403</td>\n",
       "      <td>ZN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>0.093588</td>\n",
       "      <td>0.147279</td>\n",
       "      <td>RAD_8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0.058141</td>\n",
       "      <td>0.134516</td>\n",
       "      <td>RAD_3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>0.100203</td>\n",
       "      <td>0.088869</td>\n",
       "      <td>B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.071025</td>\n",
       "      <td>0.074143</td>\n",
       "      <td>CHAS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.048219</td>\n",
       "      <td>0.017002</td>\n",
       "      <td>INDUS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.004709</td>\n",
       "      <td>-0.001742</td>\n",
       "      <td>AGE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>-0.095117</td>\n",
       "      <td>-0.042599</td>\n",
       "      <td>RAD_4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>-0.051296</td>\n",
       "      <td>-0.056407</td>\n",
       "      <td>RAD_5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.199881</td>\n",
       "      <td>-0.104955</td>\n",
       "      <td>CRIM</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.175318</td>\n",
       "      <td>-0.176803</td>\n",
       "      <td>NOX</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>-0.236459</td>\n",
       "      <td>-0.178692</td>\n",
       "      <td>TAX</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>-0.157749</td>\n",
       "      <td>-0.200660</td>\n",
       "      <td>RAD_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>-0.179215</td>\n",
       "      <td>-0.209666</td>\n",
       "      <td>PTRATIO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>-0.175632</td>\n",
       "      <td>-0.272976</td>\n",
       "      <td>RAD_6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>-0.274028</td>\n",
       "      <td>-0.361874</td>\n",
       "      <td>DIS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>-0.353311</td>\n",
       "      <td>-0.399165</td>\n",
       "      <td>RAD_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>-0.542390</td>\n",
       "      <td>-0.459577</td>\n",
       "      <td>LSTAT</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    coef_log  coef_org  columns\n",
       "20  0.524147  0.505543   RAD_24\n",
       "5   0.176170  0.297984       RM\n",
       "18  0.157229  0.184468    RAD_7\n",
       "1   0.094180  0.147403       ZN\n",
       "19  0.093588  0.147279    RAD_8\n",
       "14  0.058141  0.134516    RAD_3\n",
       "10  0.100203  0.088869        B\n",
       "3   0.071025  0.074143     CHAS\n",
       "2   0.048219  0.017002    INDUS\n",
       "6   0.004709 -0.001742      AGE\n",
       "15 -0.095117 -0.042599    RAD_4\n",
       "16 -0.051296 -0.056407    RAD_5\n",
       "0  -0.199881 -0.104955     CRIM\n",
       "4  -0.175318 -0.176803      NOX\n",
       "8  -0.236459 -0.178692      TAX\n",
       "13 -0.157749 -0.200660    RAD_2\n",
       "9  -0.179215 -0.209666  PTRATIO\n",
       "17 -0.175632 -0.272976    RAD_6\n",
       "7  -0.274028 -0.361874      DIS\n",
       "12 -0.353311 -0.399165    RAD_1\n",
       "11 -0.542390 -0.459577    LSTAT"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 线性回归\n",
    "#class sklearn.linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "# 1.使用默认配置初始化学习器实例\n",
    "lr = LinearRegression()\n",
    "\n",
    "# 2.用训练数据训练模型参数\n",
    "lr.fit(X_train, y_train)\n",
    "\n",
    "# 3. 用训练好的模型对测试集进行预测\n",
    "y_test_pred_lr = lr.predict(X_test)\n",
    "y_train_pred_lr = lr.predict(X_train)\n",
    "\n",
    "\n",
    "# 看看各特征的权重系数，系数的绝对值大小可视为该特征的重要性\n",
    "fs = pd.DataFrame({\"columns\":list(feat_names), \"coef_org\":list((lr.coef_[0,:].T)),\"coef_log\":list((lr.coef_[1,:].T))})\n",
    "fs.sort_values(by=['coef_org'],ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.1.1 模型评价"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The r2 score of LinearRegression on test with original MEDV is 0.693978981051\n",
      "The r2 score of LinearRegression on train with original MEDV is 0.754914643687\n",
      "The r2 score of LinearRegression on test with log MEDV is 0.708305459026\n",
      "The r2 score of LinearRegression on train  with log MEDV is 0.811201041743\n"
     ]
    }
   ],
   "source": [
    "# 使用r2_score评价模型在测试集和训练集上的性能，并输出评估结果\n",
    "#测试集\n",
    "print 'The r2 score of LinearRegression on test with original MEDV is', r2_score(y_test.iloc[:,0], y_test_pred_lr[:,0])\n",
    "#训练集\n",
    "print 'The r2 score of LinearRegression on train with original MEDV is', r2_score(y_train.iloc[:,0], y_train_pred_lr[:,0])\n",
    "\n",
    "# y取log\n",
    "#测试集\n",
    "print 'The r2 score of LinearRegression on test with log MEDV is', r2_score(y_test.iloc[:,1], y_test_pred_lr[:,1])\n",
    "#训练集\n",
    "print 'The r2 score of LinearRegression on train  with log MEDV is', r2_score(y_train.iloc[:,01], y_train_pred_lr[:,1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对y（价格）取log后，r2 score略变好。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
